Take Home Ex 3

Author

Aruiana

Published

February 5, 2023

Modified

February 14, 2023

pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate, ggstatsplot, heatmaply)

HDB <- read_csv(("data/HDB.csv"))

#Create addtional data on price per sqm
HDB$price_per_sqm <- (HDB$resale_price / HDB$floor_area_sqm)
#Filter 3Room, 4Room, 5Room
HDBRoom <- HDB %>% filter(flat_type=="3 ROOM" | flat_type=="4 ROOM" | flat_type=="5 ROOM") %>%
  separate(month, into = c("year", "month")) %>% 
  filter(year == "2022") %>%
  separate(remaining_lease, into = c("rmlease_years", "rmlease_month"), sep = "years") 

HDBRoom$region <- case_when(
  HDBRoom$town %in% c("ANG MO KIO", "HOUGANG", "PUNGGOL", "SERANGOON", "SENGKANG") ~ "North-East",
    HDBRoom$town %in% c("BISHAN", "BUKIT MERAH", "BUKIT TIMAH", "CENTRAL AREA", "GEYLANG", "KALLANG/WHAMPOA", "MARINE PARADE", "QUEENSTOWN", "TOA PAYOH") ~ "Central",
    HDBRoom$town %in% c("BEDOK", "PASIR RIS", "TAMPINES") ~ "East",
    HDBRoom$town %in% c("SEMBAWANG", "WOODLANDS", "YISHUN") ~ "North",
    HDBRoom$town %in% c("BUKIT BATOK", "BUKIT PANJANG", "CHOA CHU KANG", "CLEMENTI", "JURONG EAST", "JURONG WEST") ~ "West")

HDBRoom$rmlease_years <- as.numeric(HDBRoom$rmlease_years)

HDBRoom$rmlease_month <- gsub("[monthsmonth]", " ", HDBRoom$rmlease_month) %>%
  as.numeric(HDBRoom$rmlease_month) / 12 

HDBRoom
# A tibble: 24,374 × 15
   year  month town       flat_t…¹ block stree…² store…³ floor…⁴ flat_…⁵ lease…⁶
   <chr> <chr> <chr>      <chr>    <chr> <chr>   <chr>     <dbl> <chr>     <dbl>
 1 2022  01    ANG MO KIO 3 ROOM   320   ANG MO… 07 TO …      73 New Ge…    1977
 2 2022  01    ANG MO KIO 3 ROOM   225   ANG MO… 07 TO …      67 New Ge…    1978
 3 2022  01    ANG MO KIO 3 ROOM   331   ANG MO… 07 TO …      68 New Ge…    1981
 4 2022  01    ANG MO KIO 3 ROOM   534   ANG MO… 07 TO …      82 New Ge…    1980
 5 2022  01    ANG MO KIO 3 ROOM   578   ANG MO… 04 TO …      67 New Ge…    1980
 6 2022  01    ANG MO KIO 3 ROOM   452   ANG MO… 01 TO …      83 New Ge…    1979
 7 2022  01    ANG MO KIO 3 ROOM   560   ANG MO… 01 TO …      67 New Ge…    1980
 8 2022  01    ANG MO KIO 3 ROOM   435   ANG MO… 04 TO …      67 New Ge…    1979
 9 2022  01    ANG MO KIO 3 ROOM   435   ANG MO… 04 TO …      67 New Ge…    1979
10 2022  01    ANG MO KIO 3 ROOM   560   ANG MO… 10 TO …      67 New Ge…    1980
# … with 24,364 more rows, 5 more variables: rmlease_years <dbl>,
#   rmlease_month <dbl>, resale_price <dbl>, price_per_sqm <dbl>, region <chr>,
#   and abbreviated variable names ¹​flat_type, ²​street_name, ³​storey_range,
#   ⁴​floor_area_sqm, ⁵​flat_model, ⁶​lease_commence_date
HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = unique(HDBRoom$storey_range))
ggplot(data = HDBRoom,
  aes(x =storey_range)) + geom_bar()

unique(HDBRoom$storey_range)
 [1] 07 TO 09 04 TO 06 01 TO 03 10 TO 12 13 TO 15 25 TO 27 16 TO 18 19 TO 21
 [9] 22 TO 24 28 TO 30 34 TO 36 31 TO 33 37 TO 39 40 TO 42 43 TO 45 49 TO 51
[17] 46 TO 48
17 Levels: 07 TO 09 04 TO 06 01 TO 03 10 TO 12 13 TO 15 25 TO 27 ... 46 TO 48
unique(HDBRoom$town)
 [1] "ANG MO KIO"      "BEDOK"           "BISHAN"          "BUKIT BATOK"    
 [5] "BUKIT MERAH"     "BUKIT PANJANG"   "BUKIT TIMAH"     "CENTRAL AREA"   
 [9] "CHOA CHU KANG"   "CLEMENTI"        "GEYLANG"         "HOUGANG"        
[13] "JURONG EAST"     "JURONG WEST"     "KALLANG/WHAMPOA" "MARINE PARADE"  
[17] "PASIR RIS"       "PUNGGOL"         "QUEENSTOWN"      "SEMBAWANG"      
[21] "SENGKANG"        "SERANGOON"       "TAMPINES"        "TOA PAYOH"      
[25] "WOODLANDS"       "YISHUN"         
HDBRoom$rmlease_month[is.na(HDBRoom$rmlease_month)] = 0

HDBRoom$rmlease <- as.numeric(HDBRoom$rmlease_years + HDBRoom$rmlease_month)

HDBRoom$storey_range <- str_replace(HDBRoom$storey_range, "TO", "-")

sr_sort = c("01 - 03", "04 - 06", "07 - 09", "10 - 12", "13 - 15", "16 - 18", "19 - 21", "22 - 24","25 - 27","28 - 30", "31 - 33", "34 - 36", "37 - 39", "40 - 42", "43 - 45", "49 - 51", "46 - 48")

HDBRoom$storey_range <- factor (HDBRoom$storey_range, levels = sr_sort)
HDBDATA <- HDBRoom [,!names(HDBRoom) %in% c("year", "block", "street_name", "rmlease_years", "rmlease_month", "flat_model")]
gghistostats(
  data = HDBDATA, x = "floor_area_sqm",
  type = "bayes",
  test.value = 100,
  xlab = "Floor Area (sqm) of property sold"
  )

ggbetweenstats(
  data = HDBDATA,
  x = flat_type, 
  y = resale_price,
  type = "np",
  messages = FALSE
)

ggscatterstats(
  data = HDBDATA,
  x = resale_price,
  y = price_per_sqm,
  marginal = FALSE,
  )

options(scipen = 999)
mean(HDBDATA$resale_price)
[1] 536391.2
min(HDBDATA$resale_price)
[1] 200000
max(HDBDATA$resale_price)
[1] 1418000
mean(HDBDATA$price_per_sqm)
[1] 5735.973
min(HDBDATA$price_per_sqm)
[1] 3333.333
max(HDBDATA$price_per_sqm)
[1] 14731.18
scdata <- highlight_key(HDBDATA) 
  
sc1 <- ggplot(data = scdata, aes(x = town, y = resale_price, fill = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(200000,500000,1000000,150000)) +
  labs(title = "Resale Price by Town", x = "Town", y = "Resale Price")

sc2 <- ggplot(data = scdata, aes(x = town, y = price_per_sqm, fill = region)) + geom_point() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + scale_y_continuous(breaks = c(3000,6000,9000,12000,15000)) +
    labs(title = "Resale Price per sqm by Town", x = "Town", y = "Resale Price/Sqm")

subplot(ggplotly(sc1), ggplotly(sc2))
HDBDATA %>%
  mutate(class = fct_reorder(town, price_per_sqm, .fun="mean")) %>%
  ggplot(aes(y =reorder(town, price_per_sqm),
           x = price_per_sqm, fill = region)) + 
  geom_boxplot() + stat_summary(fun.y=mean, geom = "point", colour="yellow")

HDBDATA %>% 
  group_by(region) %>%
  mutate(class = fct_reorder(region, price_per_sqm, .fun="mean")) %>%
  ggplot(mapping = aes(y = flat_type, x = price_per_sqm)) +
  # Make grouped boxplot
  geom_boxplot(aes(fill = as.factor(region))) +
  theme(legend.position = "top") +
  # Adjust lables and add title
  labs(title = "HDB resale prices in 2022 by region", y="Flat Type", x = "Price per square metre (SGD)", fill = "flat_type")

HDBDATA %>%
  
grouped_gghistostats(
  x                 = resale_price,
  test.value        = 50,
  type              = "nonparametric",
  grouping.var      = region,
  normal.curve      = TRUE,
  normal.curve.args = list(color = "red", size = 1),
  ggtheme           = ggthemes::theme_tufte(),
  ## modify the defaults from `{ggstatsplot}` for each plot
  plotgrid.args     = list(nrow = 2),
  annotation.args   = list(title = "Resale price by region")
)

floorheatmap <-
  HDBDATA %>%
  group_by(town, storey_range) %>%
  summarise(median_price = median(price_per_sqm))

heatmap <- ggplot(data = floorheatmap, 
                  mapping = aes(x = town, y = storey_range, fill = median_price)) +
            geom_tile() +
  labs(title = "Heatmap of HDB breakdown by area and storey", x = "Town", y = "Storey") +
  scale_fill_gradient(name = "Median Resale Price/sqm",
                      low = "peachpuff",
                      high = "deeppink4")+
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

heatmap